import numpy as np
import pandas as pd
import string
import random
import re
import pprint
import math
from time import time
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
The below piece of code is just to avoid the scroll in the subsequent plots, because there are going to be multiple figures with a number of rows, each row with a couple of plots
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
We register the time, to see how long it takes to execute the entire notebook.
# Register the time when we start executing the notebook
tx = time()
We have cleaned up manually the file containing the papers. As a result, the file `cleanup_papers_abstract.csv` has been created and it is going to be used in the entire notebook.
The cleanup process has consisted of removing the lines that did not have values in any of the essential columns: `id`, `year`, `title` and `paper_text`. Also, for all the values in `paper_text`, we have removed the information that came before the abstract, as normally it is personal information related to the author.
Below we can see that we have kept an important number of papers to analyze (over 7000), enough to produce the analysis.
# Read the csv file from disk
papers_df = pd.read_csv("cleanup_papers_abstract.csv")
# Print the dimensions of the dataframe and the first rows to see that everything is OK
print("Number of papers to analyze: " + str(papers_df.shape[0]))
papers_df.head(n=10)
We are dropping the columns `id`, `event_type`, `pdf_name` and `abstract` from our analysis.
For the rest of the notebook we are keeping 3 columns: `year`, `title` and `paper_text`.
In order to process the information easily, we are keeping each column in a different dataset.
# Keep only the year, title and paper text from the original dataframe
papers_filtered = papers_df.drop(['id', 'event_type', 'pdf_name', 'abstract'], axis=1)
# Convert title and paper_text to lower case; each column is extracted as a
# plain Python list so later cells can iterate and index them directly.
title_lc = [title.lower() for title in papers_filtered.title]
paper_text_lc = [paper_text.lower() for paper_text in papers_filtered.paper_text]
# Publication year of each paper, aligned index-wise with the two lists above
years = [year for year in papers_filtered.year]
Below there are functions that we are using throughout the notebook to plot the results. We are explaining them separately.
# Helper constants (and, below, a function) for origin-centered Matplotlib plots.
# Plot-styling constants: the ColorBrewer "Set2" qualitative palette, a light
# grey (RGB 248/255 on each channel) and a near-black shade.
# NOTE(review): these three constants are not referenced by the functions
# visible in this notebook -- presumably used by other cells; confirm before
# removing.
set2_colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f','#e5c494','#b3b3b3']
light_grey = np.array([float(248)/float(255)]*3)
shade_black = '#262626'
def cible_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Draw a "target"-style axis: the bottom/left spines are pinned to the
    (0, 0) data origin and the top/right spines are hidden.

    The top/right/left/bottom keyword flags toggle which sides get their
    tick marks re-enabled afterwards.
    """
    ax = axes or plt.gca()
    # Hide the outer spines and pin the remaining two to the data origin.
    for side in ('right', 'top'):
        ax.spines[side].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.spines['bottom'].set_position(('data', 0))
    ax.yaxis.set_ticks_position('left')
    ax.spines['left'].set_position(('data', 0))
    # Re-enable tick marks on whichever sides the caller asked for.
    toggles = ((top, ax.xaxis.tick_top),
               (bottom, ax.xaxis.tick_bottom),
               (left, ax.yaxis.tick_left),
               (right, ax.yaxis.tick_right))
    for wanted, enable_ticks in toggles:
        if wanted:
            enable_ticks()
The 2 functions below are meant to plot a dataframe whose first column is the `year` and the remaining columns are scores of terms (words, n-grams, concepts, topics, etc).
The purpose of the plots is to see by eye the evolution of each term throughout the time, and identify more easily the presence of a trend.
The intent of plotting all the terms together is to see if there are outstanding terms.
The intent of plotting each term individually is to have a zoom of the behaviour of the term throughout the time. In the individual plot, the regression line is also plotted, as a way to identify the trend.
Finally, in the individual plot, there is another regression line, only taking into consideration the last 5 years, to see if there is a more localized trend.
# Function to display in 1 plot all the terms in the dataframe, to be analyzed.
# Years are displayed in the X axis and each term is displayed as a new line in the plot
def plot_term_df_all_together(term_df, term_name="Words"):
    """
    Plot every term of *term_df* as one line in a single large figure.

    The dataframe's first column must be 'Year' (used as the x axis); each
    remaining column is a term whose yearly scores are drawn as one line,
    with the term name in the legend.
    """
    plt.figure(figsize=(20, 12))
    x_years = term_df['Year']
    # All columns after 'Year' are terms to plot
    terms = list(term_df.columns.values)[1:]
    for label in terms:
        plt.plot(x_years, term_df[label], label=label)
    plt.legend(loc='upper right', frameon=True, framealpha=0.6,
               scatterpoints=1, bbox_to_anchor=(1.30, 1),
               title=term_name, fontsize=10)
    plt.xlabel('Year', fontsize=15)
    plt.ylabel('TOP ' + str(len(terms)) + ' ' + term_name, fontsize=15)
    plt.show()
# Function to plot individually all the terms included in the dataframe
# Years are displayed in the X axis and each term is displayed as a new plot
# Each plot also displays a regression line that fits the data for all the years
# Finally, the plot displays a second regression line, just for the last 5 years, by default
# to be able to identify more recent trends
def plot_term_df_individually(term_df, term_name="Word", trend_years=5):
    """
    Plot each term column of *term_df* in its own subplot (2 per row).

    Every subplot shows the term's yearly score, a regression line fitted over
    all years, and a second regression line fitted over the most recent years
    to surface localized trends.  All subplots share the same y range so they
    are visually comparable.

    Parameters
    ----------
    term_df : DataFrame whose first column is 'Year' and whose remaining
        columns are per-year term scores.
    term_name : str, label used in the subplot titles and returned table.
    trend_years : int, how many recent years the second regression covers.
        NOTE(review): the slice `[len(year_list) - trend - 1:]` actually
        spans trend + 1 year entries (6 points for trend_years=5) --
        confirm the extra year is intentional.

    Returns
    -------
    DataFrame with one row per term:
    (Term, mean score, overall slope, last-years slope).
    """
    # List with the slopes per term
    term_slope_list = []
    # Number of terms to plot, from column 1 to the end
    term_list = list((term_df.columns.values)[1:])
    # Calculations to display the same range for all the plots
    # to help the comparison among plots
    max_values = term_df.iloc[:,1:]
    max_value = max(max_values.max(axis=0))
    # Number of rows to plot, considering that we are going to plot 2 columns per row
    figure_rows = math.ceil(len(term_list) / 2)
    # Create the subplots and set the size of the image
    figure, ax_arr = plt.subplots(figure_rows, 2)
    figure.set_size_inches(20, figure_rows * 6)
    # Get a flat vector of plots, to help the access
    # (with an odd number of terms the last subplot of the 2-column grid
    # simply stays empty)
    ax_vector = np.ravel(ax_arr)
    # Vector containing the years
    term_df = term_df.fillna(value=0.0)
    year_list = list(term_df['Year'])
    # Iterate through the list of topics
    for i, term in enumerate(term_list):
        # Access to the data of the term
        term_distr = term_df[term]
        # Access to the corresponding subplot, set the range in the y-axis
        current_ax = ax_vector[i]
        current_ax.set_ylim([0, max_value])
        current_ax.set_xlabel("Year", fontsize=10)
        current_ax.set_title(term_name + "=" + str(term), fontsize=15)
        # Plot the topic data
        current_ax.plot(year_list, term_distr, label="Score for \""+ term + "\"")
        # Number of years for a trend
        trend = trend_years
        # Calculating the 2 regression lines (degree-1 polynomial fits):
        # z over all years, z_trend over the last `trend`+1 year entries
        z = np.polyfit(year_list, term_distr, 1)
        z_trend = np.polyfit(year_list[len(year_list) - trend - 1:], term_distr[len(year_list) - trend - 1:], 1)
        p = np.poly1d(z)
        p_trend = np.poly1d(z_trend)
        # Endpoints of each fitted line, used to recompute the slopes below
        x1 = year_list[0]
        x2 = year_list[len(year_list)-1]
        x3 = year_list[len(year_list)- trend - 1]
        x4 = year_list[len(year_list)-1]
        y1 = p(x1)
        y2 = p(x2)
        y3 = p_trend(x3)
        y4 = p_trend(x4)
        # Calculating the slope
        slope = (y2 - y1)/(x2-x1)
        slope_trend = (y4 - y3)/(x4-x3)
        term_slope_list.append((term, np.mean(term_distr), slope, slope_trend))
        # Plot the 2 regression lines
        current_ax.plot(year_list,p(year_list),
                        "r--", label="Slope for " + str(len(year_list)) + " years = " + str(slope))
        current_ax.plot(year_list[len(year_list) - trend - 1:],p_trend(year_list[len(year_list)- trend - 1:]),
                        "-.", label="Slope for the last " + str(trend) + " years = " + str(slope_trend), color="black")
        current_ax.legend(loc='upper right',frameon=True,framealpha=0.6, scatterpoints=1,
                          bbox_to_anchor=(0.75, 1))
    plt.show()
    term_slope_df = pd.DataFrame(term_slope_list, columns=['Term', "Mean score", 'Total slope', "Last years slope"])
    return term_slope_df
Below there are functions to process the corpora:
-`num_and_short_word_preprocessor` removes numbers, 1-letter and 2-letter words.
-`create_stop_words` uses the stop words from nltk and also adds a list of more words derived from an iterative analysis of the most frequent words in the corpora.
-`custom_tokenizer` breaks down the corpora into words and removes the plural of words.
-`preprocess_corpora` is just a pipeline of the above functions.
# Function to remove numbers and small words (1 or 2 letters) from a document
def num_and_short_word_preprocessor(tokens):
    """
    Lower-case a document string, then strip digit runs and 1-2 letter words.

    Used as the `preprocessor` callback of the sklearn vectorizers, so it
    receives the raw document string and must return a string; removed
    pieces leave their surrounding whitespace in place.
    """
    # Raw strings for both regexes: the original '(\d)+' was a
    # DeprecationWarning-level invalid escape sequence on modern Python and
    # used a needless capture group -- r'\d+' matches the same digit runs.
    no_numbers = re.sub(r'\d+', '', tokens.lower())
    # Drop words of 1 or 2 word-characters
    no_short_words = re.sub(r'\b\w{1,2}\b', '', no_numbers)
    return no_short_words
# Function to return a list of stop words to consider
def create_stop_words():
# We create a stop word list
stops = set(stopwords.words("english"))
# We define individual numbers and letters as stop words
all_letters_numbers = string.digits + string.ascii_letters
stops = stops.union(['abstract', 'using', 'based', 'via', 'data', 'log', 'upper', 'lower',
'mit', 'press', 'shown', 'figure', 'set', 'information', 'processing',
'step', 'basis', 'error', 'rate', 'ha', 'two', 'one', 'also', 'given',
'example', 'number', 'used', 'wa', 'weight', 'point', 'case', 'result',
'show', 'value', 'input', 'output', 'different', 'first', 'use']) # add some stopwords
stops = stops.union(list(all_letters_numbers))
return stops
# Function to convert plurals into singulars
def custom_tokenizer(doc):
word_tokenizer = RegexpTokenizer(r'\w+')
tokens = word_tokenizer.tokenize(doc)
wnl = WordNetLemmatizer()
singular_tokens = [wnl.lemmatize(i) for i in tokens]
return singular_tokens
def preprocess_corpora(corpora, stop=None):
    """
    Full text-cleaning pipeline: drop numbers/short words, tokenize and
    lemmatize, then remove stop words.

    Parameters
    ----------
    corpora : iterable of str
        Raw documents.
    stop : collection of str, optional
        Stop words to remove.  Defaults to the module-level `stop_words`,
        preserving the original behavior (the original read the global
        directly, which hid the dependency from the call site).

    Returns
    -------
    list of list of str -- one token list per document.
    """
    if stop is None:
        stop = stop_words
    cleaned = [num_and_short_word_preprocessor(sentence) for sentence in corpora]
    tokenized = [custom_tokenizer(doc) for doc in cleaned]
    # `word not in` instead of the original `not word in`
    return [[word for word in doc if word not in stop] for doc in tokenized]
The first approach we are using is Bag of Words (BOW). To do that, we use the class CountVectorizer and we do an analysis of 1-gram and 2-grams terms, separately, to see the most frequent terms in each of this two categories.
Following are the key aspects of the BOW analysis:
-Frequency of terms is analyzed per year.
-Normalization is done by dividing the number of occurrences of a term by the ratio (#words/#documents) in the year. That way we normalize taking into account the total number of words in a year and the number of documents in a year.
-Localized frequency: the algorithm selects the TOP most frequent terms in the whole corpora and also the TOP most frequent terms each year, to be able to perform a more comprehensive analysis in the plots. This way, if a term was very frequent in a given year but is not any longer, the term is still considered in the whole analysis.
def create_bow(corpora, stop, ngram=1):
    """
    Fit a bag-of-words CountVectorizer on *corpora*.

    Returns (fitted vectorizer, document-term count matrix) for n-grams of
    exactly `ngram` tokens, using the shared preprocessing/tokenizing helpers.
    """
    vectorizer = CountVectorizer(
        stop_words=stop,
        ngram_range=(ngram, ngram),
        preprocessor=num_and_short_word_preprocessor,
        tokenizer=custom_tokenizer,
    )
    fitted = vectorizer.fit(corpora)
    counts = fitted.transform(corpora)
    return fitted, counts
def get_bow_per_year(years, corpora, stop = [], ngram = 1, num_freq_words=5, num_freq_words_year=5,
                     normalice=False, debug = False):
    """
    Build a DataFrame of per-year bag-of-words counts for the most relevant terms.

    A single CountVectorizer is fitted on the whole corpora; the counts are
    summed per publication year and optionally normalized by the ratio
    (#words in the year / #documents in the year).  The returned DataFrame
    keeps the `num_freq_words` terms with the highest overall count plus the
    top `num_freq_words_year` terms of every individual year, with a leading
    'Year' column (one row per year, ascending).
    """
    # Run the BOW for the corpora
    vec, bow = create_bow(corpora, stop, ngram)
    # List to store the frequency of words per year
    frequent_words_per_year = []
    # Create a list with the unique set of years
    year_list = list(sorted(set(years)))
    # Iterate for each year
    for year in year_list:
        # List of the indexes that contains the row numbers of a given year
        idx = [i for i, x in enumerate(years) if x == year]
        # Sub-matrix with the ocurrences for a given year
        documents_in_a_year = bow[idx,]
        # Total number of words for a given year, to be used to normalize
        total_words_year = documents_in_a_year.sum()
        num_documents = len(idx)
        # Vector that contains the sum for each word
        sum_word_matrix = documents_in_a_year.sum(axis=0)
        # If we want to normalize, the sum of terms in a year is divided by ratio of the number of terms
        # in all the documents of the year per number of documents
        if normalice:
            sum_word_matrix = sum_word_matrix / (total_words_year / num_documents)
        # 1-row matrix -> plain Python list
        sum_word_list = np.array(sum_word_matrix)[0].tolist()
        # Append the sum of frequencies of a given year
        frequent_words_per_year.append(sum_word_list)
        if debug:
            print("Year = " + str(year))
            print("Documents in the year = " + str(len(idx)))
            print("Total words in the year = " + str(total_words_year))
            #print(words_freq[0:num_freq_words])
            print("\n")
    # Create a dataframe with the years as rows and the sorted vocabulary as columns
    # (vocabulary_ maps term -> column index; sorting by term gives the column
    # labels in the same order as the count matrix's columns)
    vocabulary = vec.vocabulary_.items()
    sorted_vocabulary = sorted(vocabulary, key = lambda x: x[0], reverse=False)
    sorted_words = [key for key, value in sorted_vocabulary]
    bow_per_year_df = pd.DataFrame(frequent_words_per_year, columns=sorted_words)
    # Fill the set with the most frequent terms per year
    freq_words_set = set()
    # Iterate for each year to get the X most frequent words per year
    for index, row in bow_per_year_df.iterrows():
        sorted_freq_words_year = row.sort_values(ascending=False)
        most_frequest_word_year_list = sorted_freq_words_year.index[0:num_freq_words_year]
        freq_words_set.update(most_frequest_word_year_list)
    # Calculate the sum of each word (sum per column), and order the Series in descending order
    freq_words = bow_per_year_df.sum(axis=0)
    sorted_freq_words = freq_words.sort_values(ascending=False)
    # Keep the most frequent and store them as a set
    most_freq_words_corpora_list = sorted_freq_words.index[0:num_freq_words]
    most_freq_words_corpora_set = set(most_freq_words_corpora_list)
    most_freq_words_corpora_set.update(freq_words_set)
    # Keep only the selected terms and prepend the year column
    bow_per_year_df = bow_per_year_df[list(most_freq_words_corpora_set)]
    bow_per_year_df.insert(loc=0, column='Year', value=year_list)
    return bow_per_year_df
stop_words = create_stop_words()
In this section we are analyzing 1-gram terms (words), keeping the 25 most frequent words during the 31 years, plus adding the most frequent word (if it is not included yet) per year.
We are normalizing following the criteria explained at the beginning of the section.
t0 = time()
word_df = get_bow_per_year(years, paper_text_lc, stop = stop_words, num_freq_words=25, num_freq_words_year=1,
ngram = 1, debug = False, normalice = True)
print("done in %0.3fs." % (time() - t0))
Below we can see the score (normalized occurrences) of the most frequent words.
plot_term_df_all_together(word_df, term_name = "Words")
Below we can see the individual plot for each word.
term_slope_df = plot_term_df_individually(word_df, term_name = "BOW Word")
In the below 3 tables we can see the following facts, after preprocessing and normalizing:
-The most frequent words are `model`, `algorithm`, `function`, `learning` and `network`.
-The words with better trend in 31 years are `algorithm`, `method`, `function`, `problem` and `distribution`.
-The words that show better trend in the last 5 years are `network`, `model`, `algorithm`, `function` and `learning`.
term_slope_df.sort_values(by='Mean score', ascending = False).iloc[:,[0,1]]
term_slope_df.sort_values(by='Total slope', ascending = False).iloc[:,[0,2]]
term_slope_df.sort_values(by='Last years slope', ascending = False).iloc[:,[0,3]]
In this section we are analyzing 2-gram terms, keeping the 25 most frequent terms during the 31 years, plus adding the most frequent terms (if not included yet) per year.
We are normalizing following the criteria explained at the beginning of the section.
t0 = time()
word_df = get_bow_per_year(years, paper_text_lc, stop = stop_words, num_freq_words=25, num_freq_words_year=1,
ngram = 2, debug = False, normalice = True)
print("done in %0.3fs." % (time() - t0))
Below we can see the score (normalized occurrences) of the most frequent 2-grams.
plot_term_df_all_together(word_df, term_name = "2-grams")
Below we can see the individual plot for each 2-gram.
term_slope_df = plot_term_df_individually(word_df, term_name = "BOW 2-gram")
In the below 3 tables we can observe the following facts:
-The most frequent 2-grams are `neural network`, `machine learning`, `learning algorithm`, `neural system` and `loss function`.
-The 2-grams with better trend in 31 years are `machine learning`, `loss function`, `optimization problem`, `state art` and `random variable`.
-The words that show better trend in the last 5 years are `neural network`, `neural system`, `gradient descent`, `state art` and `loss function`.
-We can observe the hype that `neural network` had, then it declined but it has gone up again very strongly in the last 5 years.
term_slope_df.sort_values(by='Mean score', ascending = False).iloc[:,[0,1]]
term_slope_df.sort_values(by='Total slope', ascending = False).iloc[:,[0,2]]
term_slope_df.sort_values(by='Last years slope', ascending = False).iloc[:,[0,3]]
The second approach we are using is Term frequency–inverse document frequency (TF-IDF). To do that, we use the class TfidfVectorizer and we do an analysis of 1-gram and 2-grams terms, separately, to see the most frequent terms in each of this two categories.
Following are the key aspects of the TF-IDF analysis:
-Frequency of terms is analyzed considering the documents for each year separatelly.
-Normalization is done by dividing the TF-IDF score by the ratio (#words/#documents) in the year. That way we normalize taking into account the total number of words in a year and the number of documents in a year.
-Localized frequency: the algorithm selects the TOP most frequent terms in the whole corpora and also the TOP most frequent terms each year, to be able to perform a more comprehensive analysis in the plots. This way, if a term was very frequent in a given year but is not any longer, the term is still considered in the whole analysis.
def recursive_len(item):
    """
    Count the leaf (non-list) elements of an arbitrarily nested list.

    A non-list argument counts as a single leaf, so e.g. a bare string
    contributes 1 regardless of its length.
    """
    # isinstance instead of the original `type(item) == list` so list
    # subclasses are descended into as well.
    if isinstance(item, list):
        return sum(recursive_len(subitem) for subitem in item)
    return 1
def create_tf_idf(corpora, stop, ngram=1):
    """
    Fit a TfidfVectorizer on *corpora*.

    Returns (fitted vectorizer, document-term TF-IDF matrix) for n-grams of
    exactly `ngram` tokens, using the shared preprocessing/tokenizing helpers.
    """
    vectorizer = TfidfVectorizer(
        stop_words=stop,
        ngram_range=(ngram, ngram),
        preprocessor=num_and_short_word_preprocessor,
        tokenizer=custom_tokenizer,
    )
    fitted = vectorizer.fit(corpora)
    scores = fitted.transform(corpora)
    return fitted, scores
def get_tf_idf_per_year(years, corpora, stop = [], ngram = 1, num_freq_words=5, num_freq_words_year=5,
                        normalice=False, debug = False):
    """
    Build a DataFrame of per-year TF-IDF scores for the most relevant terms.

    A separate TF-IDF model is fitted on each year's documents; the scores are
    summed over the year and optionally normalized by the ratio
    (#words in the year / #documents in the year).  The returned DataFrame
    keeps the `num_freq_words` terms with the highest overall score plus the
    top `num_freq_words_year` terms of every individual year, with a leading
    'Year' column (one row per year, ascending).
    """
    # Create a list with the unique years
    year_list = list(sorted(set(years)))
    # One single-row frame per year, concatenated once at the end.
    # FIX: the original called DataFrame.append inside the loop, which is
    # quadratic in the number of years and was removed in pandas 2.0;
    # collecting the frames and doing one pd.concat is equivalent and fast.
    yearly_frames = []
    # Iterate for each year
    for year in year_list:
        # List of the indexes that contains the row numbers of a given year
        idx = [i for i, x in enumerate(years) if x == year]
        # Sub-vector with the documents for a given year
        documents_in_a_year = [corpora[index] for index in idx]
        # Total number of words for a given year, to be used to normalize
        total_words_year = recursive_len(documents_in_a_year)
        num_documents = len(idx)
        vec, bow = create_tf_idf(documents_in_a_year, stop, ngram)
        # Vector that contains the sum for each word
        sum_word_matrix = bow.sum(axis=0)
        # If we want to normalize, the sum of terms in a year is divided by the
        # ratio (#terms in the year's documents / #documents in the year)
        if normalice:
            sum_word_matrix = sum_word_matrix / (total_words_year / num_documents)
        sum_word_list = np.array(sum_word_matrix)[0].tolist()
        # Column labels: this year's vocabulary sorted alphabetically, matching
        # the column order of the score matrix
        sorted_vocabulary = sorted(vec.vocabulary_.items(), key=lambda x: x[0])
        sorted_words = [key for key, value in sorted_vocabulary]
        year_df = pd.DataFrame(columns=sorted_words)
        year_df.loc[0] = sum_word_list
        yearly_frames.append(year_df)
        if debug:
            print("Year = " + str(year))
            print("Documents in the year = " + str(len(documents_in_a_year)))
            print("\n")
    if not yearly_frames:
        # No years at all: keep returning a frame with just the Year column
        return pd.DataFrame({'Year': year_list})
    # Union of every year's vocabulary; terms absent in a year become NaN -> 0
    tf_idf_per_year_df = pd.concat(yearly_frames, ignore_index=True, sort=True)
    tf_idf_per_year_df = tf_idf_per_year_df.fillna(value=0.0)
    # Collect the top terms of each individual year
    freq_words_set = set()
    for index, row in tf_idf_per_year_df.iterrows():
        sorted_freq_words_year = row.sort_values(ascending=False)
        most_frequest_word_year_list = sorted_freq_words_year.index[0:num_freq_words_year]
        freq_words_set.update(most_frequest_word_year_list)
    # Calculate the sum of each word (sum per column), and order the Series in descending order
    freq_words = tf_idf_per_year_df.sum(axis=0)
    sorted_freq_words = freq_words.sort_values(ascending=False)
    # Keep the most frequent overall terms and merge in the per-year tops
    most_freq_words_corpora_list = sorted_freq_words.index[0:num_freq_words]
    most_freq_words_corpora_set = set(most_freq_words_corpora_list)
    most_freq_words_corpora_set.update(freq_words_set)
    # Keep only the selected terms and prepend the year column
    tf_idf_per_year_df = tf_idf_per_year_df[list(most_freq_words_corpora_set)]
    tf_idf_per_year_df.insert(loc=0, column='Year', value=year_list)
    return tf_idf_per_year_df
In this section we are analyzing 1-gram terms (words), keeping the 25 most frequent words during the 31 years, plus adding the most frequent word (if it is not included yet) per year.
We are normalizing following the criteria explained at the beginning of the section.
t0 = time()
tf_idf_word_df = get_tf_idf_per_year(years, paper_text_lc, stop = stop_words, num_freq_words=25,
ngram = 1, debug = False, normalice=True)
print("done in %0.3fs." % (time() - t0))
Below we can see the score (normalized occurrences) of the most frequent words.
plot_term_df_all_together(tf_idf_word_df, term_name = "Words")
Below we can see the individual plot for each word.
term_slope_df = plot_term_df_individually(tf_idf_word_df, term_name = "TD-IDF Word")
In the below 3 tables we can see some facts, always after preprocessing and normalizing:
-The most frequent words are `model`, `algorithm`, `function`, `learning` and `network`.
-The words with better trend in 31 years are `model`, `algorithm`, `matrix`, `function` and `method`.
-The words that show better trend in the last 5 years are `network`, `algorithm`, `model`, `function` and `learning`.
term_slope_df.sort_values(by='Mean score', ascending = False).iloc[:,[0,1]]
term_slope_df.sort_values(by='Total slope', ascending = False).iloc[:,[0,2]]
term_slope_df.sort_values(by='Last years slope', ascending = False).iloc[:,[0,3]]
By far, this analysis has been the one to take longer in terms of execution time, and it is because there is a method that appends a DataFrame to another DataFrame and this operation is really time consuming.
In the 2-gram analysis we are selecting the 25 most frequent terms during the 31 years, plus the most frequent term each year (in the case it is not among the TOP 25 most frequent)
Note that, unlike the 1-gram analysis above, this call does not pass `normalice=True`, so the scores shown are raw TF-IDF sums rather than normalized values.
t0 = time()
tf_idf_word_df = get_tf_idf_per_year(years, paper_text_lc, stop = stop_words, num_freq_words=25,
ngram = 2, debug = False)
print("done in %0.3fs." % (time() - t0))
plot_term_df_all_together(tf_idf_word_df, term_name = "2-grams")
1) Neural Network
2) Gradient descent
3) Generative models
4) Stochastic gradient
5) Machine learning
6) Latent variables
7) Reinforcement learning
8) Speech recognition
term_slope_df = plot_term_df_individually(tf_idf_word_df, term_name = "TF-IDF 2-gram")
In the below 3 tables we can see the following facts:
-The most frequent words are `neural network`, `machine learning`, `learning algorithm`, `hidden unit` and `loss function`.
-The words with better trend in 31 years are `machine learning`, `low rank`, `loss function`, `latent variable` and `graphical model`.
-The words that show better trend in the last 5 years are `neural network`, `gradient descent`, `generative model`, `stochastic gradient` and `loss function`.
term_slope_df.sort_values(by='Mean score', ascending = False).iloc[:,[0,1]]
term_slope_df.sort_values(by='Total slope', ascending = False).iloc[:,[0,2]]
term_slope_df.sort_values(by='Last years slope', ascending = False).iloc[:,[0,3]]
The third approach is Latent Dirichlet Allocation (LDA). In this case, we are using the library `gensim` to fit the model, as well as to try to extract the appropriate number of concepts and their corresponding words in the documents.
Initially, we run a validation to find the right number of concepts. Through a separate validation, we had adjusted the parameters `alpha` and `eta` to `50.0/1.0*np.ones((num_topics))` and `0.1`. Considering those values for alpha and eta, we fit a model using from 10 to 40 concepts, in increments of 5. For each model, given a number of concepts, we calculate the `Coherence` score (also provided by `gensim`), and plot the results to identify visually the most appropriate number.
We are not using the entire corpora of documents for this number-of-concepts tuning, as it is very time consuming. For the test, we are using 500 documents, selected randomly.
Once we know the right parameters, we fit a model with the entire corpora.
def test_lda_parameters(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Fit one LDA model per candidate topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : gensim Dictionary built from the processed corpora.
    corpus : bag-of-words corpus (doc2bow output per document).
    texts : the tokenized documents, needed by the c_v coherence measure.
    limit, start, step : candidate topic counts are range(start, limit, step).

    Returns
    -------
    (model_list, coherence_values), both aligned with range(start, limit, step).
    """
    coherence_values = []
    model_list = []
    # Iterate through the range of concepts to validate
    for num_topics in range(start, limit, step):
        # NOTE(review): alpha=50.0/1.0*np.ones((num_topics)) is a constant 50
        # per topic; the common heuristic is 50.0/num_topics -- confirm the
        # "/1.0" denominator is intentional.
        model = models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                                         id2word = dictionary, passes=100, alpha=50.0/1.0*np.ones((num_topics)), eta=0.1,
                                         random_state=9999)
        model_list.append(model)
        # c_v coherence needs the raw token lists, not just the BOW corpus
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Grid-search settings: candidate topic counts are 10, 15, ..., 35, evaluated
# on a random sample of 500 documents (tuning on the full corpora is too slow)
num_docs = 500
start = 10
limit = 40
step = 5
# We select a list of random documents to run the test
random_documents = random.sample(paper_text_lc, num_docs)
# Clean/tokenize the sample and build the gensim dictionary + BOW corpus
processed_corpora = preprocess_corpora(random_documents)
dictionary = corpora.Dictionary(processed_corpora)
corpus = [dictionary.doc2bow(doc) for doc in processed_corpora]
t0 = time()
model_list, coherence_values = test_lda_parameters(dictionary=dictionary, corpus=corpus,
                                                   texts=processed_corpora, start=start, limit=limit, step=step)
print("done in %0.3fs." % (time() - t0))
Considering the below plot, the best range to select the number of topics is between 15 and 20. We are going to select 15.
plt.figure(figsize=(12, 8))
plt.title('Coherence score for different topic numbers', fontsize=20)
ax = plt.gca()
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics", fontsize=15)
plt.ylabel("Coherence score", fontsize=15)
plt.show()
The essential part of this approach consist of computing the topic distribution of the documents and words that make up the corpora for each year.
We put together all the documents for a given year and find the topic numbers that have the highest percentage contribution in set of documents.
def get_lda_topic_distr_per_year(years, corpora, lda_model, debug = False):
    """
    Compute the LDA topic distribution of each year's merged documents.

    Parameters
    ----------
    years : sequence of int, publication year of each document, aligned
        index-wise with *corpora*.
    corpora : bag-of-words documents (one list of (token_id, count) per paper).
    lda_model : fitted gensim LdaModel, used both to infer the per-year topic
        distribution and to name the topic columns.
    debug : bool, print per-year progress information.

    Returns
    -------
    DataFrame with a leading 'Year' column followed by one column per topic,
    named by joining the topic's 5 most probable words with '_'.
    """
    # Create a list with the unique years
    year_list = list(sorted(set(years)))
    topics_distr_per_year = []
    # Iterate for each year
    for year in year_list:
        # Indexes of the documents published this year.
        # BUG FIX: the original used `i - 1`, which shifted every year's
        # membership by one document and pulled corpora[-1] (the very last
        # document) into the first matching year -- enumerate() is already
        # 0-based, so the right index is `i` (as in get_bow_per_year).
        idx = [i for i, x in enumerate(years) if x == year]
        # Sub-vector with the documents for a given year
        documents_in_a_year = [corpora[index] for index in idx]
        # Merge the year's documents into one bag of words so the model infers
        # a single topic distribution for the whole year
        flat_list = [item for sublist in documents_in_a_year for item in sublist]
        if debug:
            print("Year = " + str(year))
            print("Documents in year = " + str(len(documents_in_a_year)))
        year_topic_distribution = lda_model[flat_list]
        distr = [d for topic, d in year_topic_distribution]
        topics_distr_per_year.append(distr)
    # NOTE: `distr` deliberately leaks from the last loop iteration; its length
    # is the number of topics the model reported for that year
    topic_info = lda_model.show_topics(num_topics = len(distr), num_words = 5, formatted=False)
    # Column label per topic: its 5 most probable words joined by '_'
    topics_words = ['_'.join([wd[0] for wd in tp[1]]) for tp in topic_info]
    topic_df = pd.DataFrame(topics_distr_per_year, columns = topics_words)
    topic_df.insert(loc=0, column='Year', value=year_list)
    return topic_df
We fit the model with the entire corpora of documents and setting the number of topics to 15. Then, we obtain the topic distribution per year, considering all the documents that make up that year.
# Number of topics selected from the coherence tuning above
num_topics = 15
#num_docs = 500
#random_documents = paper_text_lc[0:num_docs]
#processed_corpora = preprocess_corpora(random_documents)
# Clean/tokenize the FULL corpora and build the gensim dictionary + BOW corpus
processed_corpora = preprocess_corpora(paper_text_lc)
dictionary = corpora.Dictionary(processed_corpora)
corpus = [dictionary.doc2bow(doc) for doc in processed_corpora]
t0 = time()
# NOTE(review): alpha here is 20.0 per topic while the tuning run used 50.0 --
# confirm the difference is intentional.
lda = models.ldamodel.LdaModel(corpus, num_topics=num_topics,
                               id2word = dictionary, passes=100,alpha=20.0/1.0*np.ones((num_topics)),
                               eta=0.1, random_state=9999)
print("done in %0.3fs." % (time() - t0))
# Topic distribution per year, over the BOW corpus of all documents
topic_df = get_lda_topic_distr_per_year(years, corpus, lda, debug = False)
#topic_df = get_lda_topic_distr_per_year(years[0:num_docs], corpus, lda, debug = False)
Below we can see a plot with the score that identifies the contribution of each topic to the sub-corpora of every year.
plot_term_df_all_together(topic_df, "Topics")
1) Object detection using deep learning (coming from the concept made up of image_layer_network_object_training).
2) Markov decision process (coming from the concept state_time_policy_action_reward)
term_slope_df = plot_term_df_individually(topic_df, "LDA Topic")
term_slope_df.sort_values(by='Mean score', ascending = False).iloc[:,[0,1]]
term_slope_df.sort_values(by='Total slope', ascending = False).iloc[:,[0,2]]
term_slope_df.sort_values(by='Last years slope', ascending = False).iloc[:,[0,3]]
print("done in %0.3fs." % (time() - tx))
In this section, we use `Word2Vec` to process all texts in the `paper_text_lc` corpus. Essentially, `Word2Vec` uses a shallow, two-layer neural network to produce word embeddings, mapping each word to a vector of real numbers. With these embedded word vectors as the parameters of the hidden layers, we can analyze the relationships between the words in the corpus. We conduct the `Word2Vec` analysis on `paper_text_lc` and analyze the top 10 similar words for each word in `top_words_list`. The `top_words_list` contains 5 very frequent words in `paper_text_lc`: `model`, `algorithm`, `function`, `learning`, and `network`. Note that the concept `similar words` here means words that are semantically close to a given word.
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
stop_words = create_stop_words()
# Use Word2Vec to find the top similar words of the most frequent words in the paper texts
processed_corpora = preprocess_corpora(paper_text_lc)
# NOTE(review): `size=` is the pre-gensim-4 keyword (gensim 4 renamed it to
# `vector_size=`, which the Doc2Vec cell below already uses) -- confirm the
# installed gensim version accepts both spellings.
model_Word = Word2Vec(processed_corpora, size=200, window=4, min_count=100, workers=4)
Word2Vec_word_vectors = model_Word.wv

def Word2Vec_most_similar_word(word):
    """Print the 10 words whose Word2Vec embeddings are closest to *word*."""
    result = Word2Vec_word_vectors.most_similar(positive=[word])
    print("The top 10 words most similar to " + str(word) + ":")
    print(result)
    print("\n")

# The five most frequent words identified by the BOW analysis
top_words_list = ["model", "algorithm", "function", "learning", "network"]
for word_number in range(0, len(top_words_list)):
    Word2Vec_most_similar_word(top_words_list[word_number])
In this section, we use `Doc2Vec` to process all texts in the `paper_text_lc` corpus. Different from `Word2Vec`, which analyzes similarity between word vectors, `Doc2Vec` tags the text documents (with `TaggedDocument` in the following code) and conducts analysis on tag vectors. In other words, `Doc2Vec` is more powerful than `Word2Vec` because the former vectorizes both words and documents. Here, in the tagged documents, we use `Doc2Vec` to find the top ten most similar words for each word in the same `top_words_list` as in the last section. The `top_words_list` contains 5 very frequent words in `paper_text_lc`, including `model`, `algorithm`, `function`, and `learning`. The produced top ten similar words for each word using `Doc2Vec` are similar to the results generated using `Word2Vec`.
# Use Doc2Vec to find the most similar words of the same frequent terms
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, TaggedLineDocument
# Re-run the cleaning pipeline and tag each document with its position, so
# Doc2Vec can learn document vectors alongside word vectors
processed_corpora = preprocess_corpora(paper_text_lc)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed_corpora)]
model_Doc = Doc2Vec(documents, vector_size=100, window=4, min_count=100, workers=4)
Doc2Vec_word_vectors = model_Doc.wv

def Doc2Vec_most_similar_word(word):
    """Print the 10 words whose Doc2Vec embeddings are closest to *word*."""
    result = Doc2Vec_word_vectors.most_similar(positive=[word])
    print("The top 10 words most similar to " + str(word) + ":")
    print(result)
    print("\n")

# The same five frequent words analyzed in the Word2Vec section above
top_words_list = ["model", "algorithm", "function", "learning", "network"]
for word_number in range(0, len(top_words_list)):
    Doc2Vec_most_similar_word(top_words_list[word_number])
In this part, we use `Apriori` to extract frequent itemsets from the `title_lc` file. Due to the long computational time required for the `paper_text_lc` file in this problem, we will instead use the `title_lc` file here. We will use `min_support=0.03` in the analysis. We will first determine the top 10 frequent words in all titles for each individual year, and then construct a list collecting all of the frequent words across all years. For each word in this list, we will list its support value in different years. We will then obtain a dataframe in which each column contains the support values (across years) of an individual frequent word. We will remove the columns that have fewer than 5 nonzero support values, and then plot the changes of the frequent words over time. Even though the number of word samples in the `title_lc` file is smaller than in the `paper_text_lc` file, we can still see that, for some frequent words such as `network`, the trends of their support values are similar to the trends of their frequencies.
# For the titles in each individual year, find the top ten itemsets (words or
# word combinations) having the highest support values.
# Note: "support" is the ratio of the number of documents containing a
# particular word to the total number of documents.
from mlxtend.preprocessing import TransactionEncoder

TE = TransactionEncoder()

# Distinct publication years, in chronological order.
year_list = sorted(set(years))

# For each year, record the row numbers of the titles published that year,
# together with how many titles that is.
title_years_index = [
    [row for row, y in enumerate(years) if y == year]
    for year in year_list
]
title_years_index_length = [len(rows) for rows in title_years_index]

# TransactionEncoder.fit learns the unique tokens in the dataset, and
# .transform one-hot encodes every title into a boolean row vector.
processed_title = preprocess_corpora(title_lc)
te_ary = TE.fit(processed_title).transform(processed_title)
title_TE = pd.DataFrame(te_ary, columns=TE.columns_)
title_TE.head(10)
from mlxtend.frequent_patterns import apriori

# For the titles in each individual year, find the top ten itemsets (words or
# word combinations) having the highest support values.
# Note: "support" is the ratio of the number of documents containing a
# particular itemset to the total number of documents.
all_title_frequent_top10_items = []
all_title_frequent_top10_support = []
# Index title_years_index by position instead of the hard-coded `year - 1987`,
# so the code keeps working if the data set starts in a different year.
for year_pos, year in enumerate(year_list):
    title_frequent_items = apriori(
        title_TE.loc[title_years_index[year_pos], :],
        min_support=0.03,
        use_colnames=True,
    )
    title_frequent_items.sort_values('support', inplace=True, ascending=False)
    # apriori returns the columns [support, itemsets]; keep the ten itemsets
    # with the highest support for this year.
    title_frequent_top10_items = title_frequent_items.iloc[:, 1].head(10).tolist()
    title_frequent_top10_support = title_frequent_items.iloc[:, 0].head(10).tolist()
    all_title_frequent_top10_items.append([list(x) for x in title_frequent_top10_items])
    all_title_frequent_top10_support.append(list(title_frequent_top10_support))
# Pair every year's top itemsets with their support values and key the
# resulting (itemset, support) lists by year.
items_support_list = []
# Iterate over however many years were collected instead of hard-coding 31.
for year_pos in range(len(year_list)):
    pairs = list(zip(all_title_frequent_top10_items[year_pos],
                     all_title_frequent_top10_support[year_pos]))
    items_support_list.append(pairs)
dictionary = dict(zip(year_list, items_support_list))
from collections import OrderedDict

# Collect the words appearing in each year's leading frequent itemsets.
# NOTE: the original guard `if not word in merged_top_words_list` compared a
# word (a string) against a list of *sublists*, so it was always True and
# filtered nothing; the real de-duplication happens below with
# OrderedDict.fromkeys, so the dead condition is removed here.
# NOTE(review): only the first 8 itemsets per year are merged although 10
# were collected — confirm whether that is intentional.
merged_top_words_list = []
for year in year_list:
    for idx in range(8):
        merged_top_words_list.append(list(dictionary[year][idx][0]))
# Flatten the list of sublists into a single list of words ...
flattened_list = [val for sublist in merged_top_words_list for val in sublist]
# ... and keep only the first occurrence of each word, preserving order.
top_word_list = list(OrderedDict.fromkeys(flattened_list))
# Initialize an empty dataframe with one column per word in top_word_list;
# the per-year rows are filled in later.
df_columnames = list(top_word_list)
df = pd.DataFrame(columns=df_columnames)
# Extract the support values of each word in the top_word_list from the
# dictionary and collect its support values in different years as a column
# in a dataframe.
word_support_values = []
for year in year_list:
    invidual_year_list = []
    words_in_invidual_year = []
    # NOTE(review): only the first 9 itemsets per year are scanned even though
    # 10 were collected — confirm whether that is intentional.
    # NOTE(review): dictionary[year][idx][0] is a list of words (an itemset);
    # [0][0] takes only the FIRST word, so multi-word itemsets are reduced to
    # their first word here — verify this matches the intended matching.
    for idx in range(9):
        words_in_invidual_year.append(dictionary[year][idx][0][0])
    for word in top_word_list:
        if word in words_in_invidual_year:
            # Word is frequent this year: look up its support value.
            location = words_in_invidual_year.index(word)
            invidual_year_list.append( dictionary[year][location][1] )
        else:
            # Word not frequent this year: record a support of 0.
            invidual_year_list.append(0)
    word_support_values.append(invidual_year_list)
# One row per year; the `year-1987` offset assumes consecutive years starting
# at 1987 — TODO confirm against year_list.
for year in year_list:
    df.loc[year-1987] = word_support_values[year-1987]
df.insert(loc=0, column='Year', value=year_list)
# Drop every column whose support is nonzero in fewer than 5 years: such
# words are too sparse to show a meaningful trend. (The 'Year' column always
# has 31 nonzero entries, so it survives the filter.)
delete_word_list = [
    name for name in df.columns
    if np.count_nonzero(df[name].tolist()) < 5
]
df.drop(delete_word_list, axis=1, inplace=True)
In the following plots of the `top_word_list` and the tables of `slopes`, we can see the following facts about the `title_lc` file:
-The five words that have the highest support values are `network`, `neural`, `learning`, `model`, and `algorithm`. This result is very similar to what we got when analyzing the frequencies of words in the much larger `paper_text_lc` file, in which the five words with the highest frequencies are `model`, `algorithm`, `function`, `learning` and `network`.
-The words with the best upward trend of the support value over the 31 years are `learning`, `inference` and `process`, which is NOT the same as the words with the highest frequencies that we obtained before. This is because (1) the number of samples in the `title_lc` file is much smaller than in the `paper_text_lc` file, and (2) `support value` is different from `frequency`. The high support values for `learning`, `inference` and `process` mean that the percentage of paper titles containing these three words kept increasing over the past 31 years.
-The words that show the best upward trend in the last 5 years are `network` and `neural`. This indicates that the percentage of titles that contain `network` and `neural` is increasing faster than for other words.
# Plot the support-value trends of the surviving frequent words: first all
# words in one figure, then one subplot per word (plotting helpers are
# presumably defined earlier in the notebook).
plot_term_df_all_together(df, term_name = "top_word_list")
plot_term_df_individually(df, term_name = "top_word_list")